*===============================================================================
* Merge data and modify codes  
*===============================================================================
* Start from an empty session and disable output paging.
clear all
set more off

* Every input and output path in this do-file is resolved through the global
* dta_files, so stop with a clear message if it has not been defined.
if `"$dta_files"' == "" {
	display as error "global dta_files is not set; define it before running this do-file"
	exit 198
}

cd "$dta_files"

*===============================================================================
* 1a. Merge Eurostat data
*===============================================================================

* Load Eurostat_web_1996_2011.dta and add the two other files on the
* nuts-year key; nogen suppresses the _merge variable so merges can be chained.
use "Eurostat_web_1996_2011.dta", clear

merge 1:1 nuts year using "EYB_1968_1996_all.dta", nogen
merge 1:1 nuts year using "FOCUS_RR_labor.dta", nogen

* The first two characters of the nuts code are used as the country identifier.
gen country=substr(nuts,1,2)

* Keep only the twelve listed countries. The list is split over two inlist()
* calls because inlist() accepts a limited number of string arguments.
* (The earlier version chained comparisons with "|" but contained a stray
* "||", which is not Stata's logical-or operator.)
keep if inlist(country,"UK","SE","NL","IT","FR","ES") | ///
	inlist(country,"DK","DE","CH","BE","AT","FI")

*===============================================================================
* 1b. Merge country data
*===============================================================================

* Country-level files, merged one after another on the nuts-year key.
* Each name is the file stem; ".dta" is appended in the merge command and
* the files are looked up relative to the current working directory.
local country_files ///
	AT_labor ///
	CA_CENS1971_labor ///
	CA_labor ///
	DK_labor ///
	ES_labor ///
	FI_labor ///
	FR_labor ///
	IT_labor ///
	SE_labor ///
	UK_CENS1971_labor ///
	UK_CENS1981_labor ///
	UK_CENS1991_labor ///
	US_CENS1970_labor ///
	US_labor ///
	IPUMS_labor

foreach src of local country_files {
	* Echo the file name so the log shows which merge is running.
	display "`src'"
	merge 1:1 nuts year using `src'.dta, nogen
}

* Remove every variable whose name starts with "ts".
drop ts*

*===============================================================================
* 2. Compute row-wise medians (and the mean unemployment rate) over all sources
*===============================================================================
* Row-wise median over all columns sharing each prefix (one column per source).
foreach stub in POP LF EMP UNEMP {
	egen `stub'_median = rowmedian(`stub'*)
}

* The unemployment rate is combined with a row-wise mean instead of a median.
egen ue_rate_mean = rowmean(ue_rate*)

* Fill gaps from the accounting identities, in this order (each step can use
* values filled in by the previous one):
*   LF    = UNEMP * 100 / ue_rate
*   UNEMP = LF * ue_rate / 100
*   EMP   = LF - UNEMP
*   LF    = EMP + UNEMP
replace LF_median    = UNEMP_median*100/ue_rate_mean if LF_median==.
replace UNEMP_median = LF_median*ue_rate_mean/100    if UNEMP_median==.
replace EMP_median   = LF_median-UNEMP_median        if EMP_median==.
replace LF_median    = EMP_median+UNEMP_median       if LF_median==.

* Keep the identifiers and the combined series only.
keep year nuts region_name *_median *mean

*===============================================================================
* 3. Clean Data 
*===============================================================================

*-------------------------------------------------------------------------------
* a. Rename codes (composite codes that are considered 
* to be rough proxies for NUTS2 codes in later periods)
*-------------------------------------------------------------------------------

* Recode composite source codes to the single code they stand in for.
replace nuts="DE73"      if nuts=="DE724&DE73"
replace nuts="DE71&DE72" if nuts=="DE71&DE721&DE722&DE723&DE725"
replace nuts="DE91"      if inlist(nuts, ///
	"DE911&DE912&DE91B&DE917&DE91A&DE916", ///
	"DE911&DE912&DE91B&DE917", ///
	"DE91A&DE925&DE926&DE918&DE916&DE919&DE915")
replace nuts="DE93"      if inlist(nuts, ///
	"DE931&DE93A&DE934&DE935&DE933&DE938&DE914&DE913", ///
	"DE932&DE939&DE937&DE93B&DE936")
replace nuts="DE94"      if inlist(nuts, ///
	"DE944&DE94E&DE949&DE94B", ///
	"DE94C&DE947&DE942&DE94H", ///
	"DE94A&DE945&DE94G&DE946&DE943&DE94D&DE941&DE948&DE94F")
replace nuts="UKC&UKD1"  if nuts=="UKC1&UKC2&UKD1"

* After recoding, several rows can share one nuts-year cell; reduce them to
* one row per cell by averaging.
* NOTE(review): where several different sub-region groups were recoded to the
* same code (e.g. DE91), this averages their levels rather than summing them
* -- confirm this is intended.
collapse (mean) POP_median EMP_median UNEMP_median LF_median ue_rate_mean, by(nuts year)

* Keep a copy of the full panel; the composite regions built below are
* appended to it at the end of this step.
* (tempfile stores the path in the local macro "append", so it must be
* referenced as `append', not `append.dta'.)
tempfile append
save "`append'"

* Tag the regions that are combined into composite regions and drop the rest.
gen composite_nuts=""
replace composite_nuts="FR82&FR83" if inlist(nuts,"FR82","FR83")
replace composite_nuts="NL21&NL23" if inlist(nuts,"NL21","NL23")
replace composite_nuts="ITH1&ITH2" if inlist(nuts,"ITH1","ITH2")
replace composite_nuts="DE91&DE92" if inlist(nuts, ///
	"DE925&DE926&DE918&DE919&DE915", ///
	"DE922&DE923&DE927&DE928&DE929", ///
	"DE91","DE92")
replace composite_nuts="DE71&DE72" if inlist(nuts,"DE71","DE72")
keep if composite_nuts!=""

tempfile precollapse
save "`precollapse'"

* Levels (population, employment, ...) of a composite region are the sums of
* its parts.
collapse (sum) POP_median EMP_median UNEMP_median LF_median, by(composite_nuts year)

tempfile postcollapse
save "`postcollapse'"

* The unemployment rate of a composite region is the weighted mean of its
* parts, computed from the pre-collapse rows.
use "`precollapse'", clear

* Weight by population; rows with missing or zero population get weight 1.
gen wgt = POP_median
replace wgt = 1 if wgt==.|wgt==0

collapse (mean) ue_rate_mean [aw=wgt], by(composite_nuts year)

* Every composite-year cell must be present in both collapsed files.
merge 1:1 composite_nuts year using "`postcollapse'", assert(3) nogen

* collapse (sum) returns 0 when all inputs are missing; turn those zeros
* back into missing values.
foreach v of varlist POP_median LF_median EMP_median UNEMP_median ue_rate_mean {
	replace `v'=. if `v'==0
}

* Add the composite regions to the full panel and reduce to one row per
* nuts-year cell again.
rename composite_nuts nuts
append using "`append'"
collapse (mean) POP_median EMP_median UNEMP_median LF_median ue_rate_mean, by(nuts year)

*-------------------------------------------------------------------------------
* b. Merge data for codes that are the same for nuts2 and nuts3 (eg DE3 and DE30)
*-------------------------------------------------------------------------------
* Keep a copy of the panel under the original codes.
* (tempfile stores the path in the local macro "append2", so it must be
* referenced as `append2', not `append2.dta'.)
tempfile append2
save "`append2'"

* For the listed three-character codes, relabel the rows with the
* four-character code obtained by appending "0" (e.g. DE3 -> DE30).
foreach code in BE1 DE3 DE4 DE5 DE6 DE8 DEC DEE DEF EL3 ES3 ES7 FR1 FR3 UKN {
	replace nuts="`code'0" if nuts=="`code'"
}

* Rows originally stored under the short and the long code now share one
* nuts-year cell; average them.
collapse (mean) LF_median EMP_median UNEMP_median POP_median ue_rate_mean, by(nuts year)

* Bring back the rows under their original codes (so the short codes remain
* in the data) and reduce to one row per nuts-year cell.
append using "`append2'"
collapse (mean) POP_median EMP_median UNEMP_median LF_median ue_rate_mean, by(nuts year)

*-------------------------------------------------------------------------------
* c. Clean Data from previous collapses
*-------------------------------------------------------------------------------

* replace zeros from previous "collapses" with "."
* Expand the abbreviated names to full variable names once, then set every
* exact zero in those variables to missing.
unab zero_vars : POP_ LF_ EMP_ UNEMP_ ue_rate_
foreach series of local zero_vars {
	replace `series' = . if `series' == 0
}

*-------------------------------------------------------------------------------
* 4. Create NUTS-level flags (0, 1 or 2) and drop codes that get no level
*-------------------------------------------------------------------------------
* Country identifier: first two characters of the code.
gen country = substr(nuts,1,2)

* Codes of length 2, 3 and 4 get level 0, 1 and 2 respectively;
* every other length starts out as missing.
gen nuts_level = length(nuts)-2 if inrange(length(nuts),2,4)

* Composite regions (and DEF) are assigned level 2.
replace nuts_level = 2 if inlist(nuts,"DE71&DE72","DE91&DE92","DEF", ///
	"FR82&FR83","ITH1&ITH2","NL21&NL23")

* US and CA codes default to level 2; US codes longer than six characters
* are level 1; the two national aggregates are level 0.
replace nuts_level = 2 if inlist(country,"US","CA")
replace nuts_level = 1 if country=="US" & length(nuts)>6
replace nuts_level = 0 if inlist(nuts,"Canada","Entire U.S.")

* Anything that received no level is removed.
drop if missing(nuts_level)

********************************************************************************

* Strip the _median / _mean suffixes from the combined series.
* The old names are written out in full: the earlier form ("rename POP POP")
* only worked through variable-name abbreviation and fails when abbreviation
* is switched off.
rename POP_median   POP
rename LF_median    LF
rename EMP_median   EMP
rename UNEMP_median UNEMP

rename ue_rate_mean ue_rate

* Sort and order the panel, then write it in the older file format.
sort nuts year
order year nuts POP LF EMP UNEMP ue_rate
saveold "$dta_files/step100_all_labor_data", replace

